# set R to source file location
# To render log file run rmarkdown::render("replication_code.R", "html_document")

start <- Sys.time()
library(pacman)
p_load("tidyr", "lubridate", "sf", "lfe", "data.table", "stargazer", 
       "RColorBrewer", "dplyr", "zoo", "gsynth", "randomForest", 
       "verification", "tm", "stm",  "xtable", "ggplot2", "knitr", 
       "stringi", "stringr", "Formula", "mnormt")
set.seed(1)

# @knitr dpi=150, out.width="100%", fig.dim = c(10, 10), dev = "png"

#######################################
# Define new functions
#######################################

# Cutoffs for p-values
cutoff <- function(x, p){
x <- roundr(x, 1)
y <- c("^{***}", "^{**}", "^{*}", "")
p <- cut(p, breaks = c(0, 0.001, 0.01, 0.05, Inf), 
         include.lowest = TRUE, labels = y)
paste(x, p, sep = "")
}

# regression output processing
coef_out <- function(z, name = NULL, below = TRUE, dollar = FALSE, round = 1){
  w <- grep(name, names(coefficients(z)))
  names <- names(coefficients(z))[w]
  y <- matrix(coefficients(summary(z))[w, ], nrow = length(w))
  b <- roundr(y[, 1], round)
  p <- y[, 4]
  p <- ifelse(p < 0.001, "^{***}", ifelse(p < 0.01, "^{**}", ifelse(p < 0.05, "^{*}", "")))
  se <- paste("(", roundr(y[,2],  round), ")", sep="")
  s <- ifelse(dollar, "$", "")
  if(!below) return(data.table(var = names, b = paste(s, b, "~", se, p, s, sep = "")))
  if(below) return(rbind(b = paste0(s, b, p, s), se = se) %>% c() 
                   %>% data.table() %>% setnames("b") 
                   %>% data.table(var = rep(names, each = 2), type = rep(c("Coef", "SE"),  nrow(.)/2)))
}

# nicer rounding 
roundr <- function(x, n = 2) {
  f <- function(x, n) sprintf(paste("%.", n, "f", sep=""), round(x, n))
  if (is.null(dim(x))) out <- f(x, n)
  if (!is.null(dim(x))) out <- apply(x, 2, f, n = n)
  out
}

# Some functions produce a lot cat output, supress it
quiet <- function (..., messages = FALSE, cat = FALSE) 
{
  if (!cat) {
    sink(tempfile())
    on.exit(sink())
  }
  out <- if (messages) 
    eval(...)
  else suppressMessages(eval(...))
  out
}

##################################################
# Figure 1: Monument removals in time
##################################################
lenin <- readRDS("statues.RDS")
# Select post-soviet statues and remove Donbass
lenin <- lenin[cycle > 0 & !region %in% c("Donetskaya", "Luhanskaya")]
#  Select statues where exact removal dates are available
Lenin <- lenin%>%
  group_by(date_ymd)%>%
  summarise(r = n())%>%ungroup()%>%
  mutate(y = sum(r) - cumsum(r)) %>% na.omit() %>% data.table()
Lenin <- Lenin[date_ymd != ymd("2019-12-31")]

  e <- as.Date(c("2012-10-28", "2014-05-25", "2014-10-26", "2019-03-31", "2019-07-19"))
  # Breaks at each election
  y <- c(nrow(lenin), nrow(lenin) - cumsum(lapply(list(1, 2, c(3, 4), c(5, 6)), 
                      function(x) sum(lenin$cycle%in%x))%>%unlist()))
  g1 <- ggplot(Lenin, aes(x = as.Date(date_ymd), y = y)) + 
    geom_point(aes(size = r))  + geom_line() + theme_bw() + 
    theme(legend.position="top",
          legend.direction = "horizontal",
          legend.text=element_text(size = 14),
          legend.title = element_text(size = 14),
          panel.grid.major.y = element_blank(), 
          panel.grid.minor = element_blank(),
          axis.text=element_text(size = 12),
          axis.title.y = element_text(size = 14),
          axis.text.x=element_text(size = 12, face = "bold", color = "black", angle = 45, hjust = 1),
          axis.title=element_text(size = 12)) + xlab("") + 
    scale_size_continuous(breaks = c(1, 10, 50, 100)) + 
    labs(size = "Monuments removed (per day): ") + 
    scale_x_date(breaks = e, labels = as.yearmon(e)) + 
    scale_y_continuous(breaks = y[-4], limits = c(0, 1450)) + 
    ylab("Standing monuments")
  # Add the step function  
  s <- data.table(x = c(as.Date("2012-10-28"), e), y = c(y, last(y)))
  g1 + geom_step(data = s, aes(x = x, y = y))

##################################################
# Figure 2: Locations of the Soviet monuments
##################################################
# Load maps
ukr <- readRDS("UKR_adm1.RDS") %>% st_as_sf()
# Re-load statues
lenin <- readRDS("statues.RDS")
# Add geography
lenin <- st_as_sf(subset(lenin, !is.na(lon)), 
                  coords = c("lon", "lat"), 
                  crs = st_crs(ukr), 
                  agr = "constant")
names <- c("Removed before Euromaidan", "Removed during/after Euromaidan", "Standing / NA")
lenin$removed[lenin$cycle==0] <- names[1]
lenin$removed[lenin$cycle > 0 & lenin$cycle < 6] <- names[2] 
lenin$removed[lenin$cycle == 6 | is.na(lenin$cycle)] <- names[3]
lenin$removed <- factor(lenin$removed, levels = names)
cols <- c(brewer.pal(9, "Blues")[c(5, 9)], "#FF8000")
ggplot() + geom_sf(data = ukr, fill = "white", col = "black", lwd = 0.3, alpha = 0.1) + 
  coord_sf(crs = st_crs(ukr), datum = NA) + 
  geom_sf(data = lenin, aes(shape = removed, col = removed), alpha = .75, cex = 1.5) + 
  coord_sf(crs = st_crs(lenin), datum = NA) + 
  theme_minimal() + 
  scale_color_manual(name = "", values = c("black", "grey", "black")) + 
  scale_shape_manual(name = "", values = c(2, 17, 4)) + 
  theme(legend.position="top", legend.margin=ggplot2::margin(b=-20), legend.box.margin=ggplot2::margin(0,0,0,0)) + 
  guides(shape = guide_legend(override.aes = list(size=4)))

##################################################
# Data and main results
##################################################

# Data on elections and removals:
d <- readRDS("main_data.RDS")

# Figure 1 (Appendix A): Validating measures
d[d[election == ymd("2004-12-25")], on = "name", y_2004 := i.pro_russian]
d[d[election == ymd("2010-02-07")], on = "name", y_2010 := i.pro_russian]
D <- d[, lapply(.SD, function(x) cor(pro_russian, x, use = "complete")),
       by = election, .SDcols = c("y_2004", "y_2010")] %>% melt(id.vars = c("election"))

ggplot(data = D, aes(x = as.factor(election) %>% as.numeric(), y=value, color = variable)) + 
  geom_point(size = 2) + geom_line(size = 1) + 
  theme_bw() + 
  ylab("Correlation") + xlab("") + 
  scale_x_continuous(labels = d$election %>% unique() %>% sort() %>% as.yearmon(), breaks = 1:11) + 
  scale_color_manual(labels = c("2004 runoff (rerun)", "2010 runoff"), values = c("dark grey", "black")) + 
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), lim = c(0, 1)) + 
  theme(
  legend.position="top", 
  legend.title = element_blank(), 
  legend.text=element_text(size = 14),
  panel.grid.minor = element_blank(),
  axis.text=element_text(size = 14),
  axis.text.x=element_text(size = 12, face = "bold", color = "black", angle = 45, hjust = 1),
  axis.title=element_text(size = 12)) 

# Remove 2019 presidential runoff
d <- d[election != ymd("2019-04-21")]
d[, cycle := which(election == unique(d$election)%>%ymd()%>%sort()) %>% as.numeric(), by = election]
d <- d[order(name, election)]
d[, time := as.numeric(cycle)]

# Select precincts that had any statues prior to Leninopad
d <- d[name %in% d[n > 0 & election == ymd("2012-10-28"), name]]
# Number of statues at time t
d[, n := ifelse(is.na(n), max(n, na.rm = TRUE), n), by = name]
# Number of statues removed by time t
d[, X := max(n) - n, by = name]

# List of models (specifications)
m <- Formula(y ~ X | name + election | 0 | name + oblast)
models <- list(
  m,
  update(m, . ~ . | . + time:oblast),
  update(m, . ~ . + time:(log(dist_to_kiev) + lon*lat + log(km_road)) | . + time:oblast + time:size),
  update(m, . ~ . | . + name:time)
)

# Use this to replicate estimation for each outcome
outcomes <- function(D, m){
  list(
    felm(update(m, turnout ~ .), data = D, weights = D$voters),
    felm(update(m, ru_turnout ~ .), data = D, weights = D$voters))
}

########################################################
# Table 1: Estimates from multi-period DiD regressions
########################################################

# This vector collects the estimates
z <- lapply(models, outcomes, D = d)

stargazer(z[[1]][[1]], z[[2]][[1]], z[[3]][[1]], z[[4]][[1]],
          style="apsr",
          label = "tab:main_votes",
          omit.table.layout = "n", omit.stat=c("rsq", "f", "ser"), omit=c("Constant"),
          float = T, digits = 1,
          title = "Diff-in-diff estimates combining all elections.", 
          add.lines = list(
            c("Precincts FE", rep("\\checkmark", 4)),
            c("Election FE", "\\checkmark", rep("", 3)),
            c("Oblast FE $\\times$ time", "", rep("\\checkmark", 3)),
            c("Covariates $\\times$ time", c("", "", "\\checkmark", "\\checkmark")),
            c("Precincts FE $\\times$ time", c("", "", "", "\\checkmark"))),
          covariate.labels = c("Effect on overall turnout"),
          dep.var.labels.include = FALSE,
          no.space = TRUE,
          keep = "X",
          star.cutoffs = c(.05, 0.01, 0.001))

stargazer(z[[1]][[2]], z[[2]][[2]], z[[3]][[2]], z[[4]][[2]],
          style="apsr",
          label = "tab:main_turnout",
          omit.table.layout = "n", omit.stat=c("rsq", "f", "ser"), omit=c("Constant"),
          float = T, digits = 1,
          title = "Diff-in-diff estimates combining all elections.", 
          add.lines = list(
            c("Precincts FE", rep("\\checkmark", 4)),
            c("Election FE", "\\checkmark", rep("", 3)),
            c("Oblast-election FE", "", rep("\\checkmark", 3)),
            c("Covariates", c("", "", "\\checkmark", "\\checkmark")),
            c("Time trend", c("", "", "", "\\checkmark"))),
          covariate.labels = c("Effect on pro-Soviet turnout"),
          dep.var.labels.include = FALSE,
          no.space = TRUE,
          keep = "X",
          star.cutoffs = c(.05, 0.01, 0.001)
)

## Time as factor (Table B.2. Appendix B.1):
d[, time := election]
z <- lapply(models, outcomes, D = d)

stargazer(z[[1]][[1]], z[[2]][[1]], z[[3]][[1]], z[[4]][[1]],
          style="apsr",
          label = "tab:main_votes_factor",
          omit.table.layout = "n", omit.stat=c("rsq", "f", "ser"), omit=c("Constant"),
          float = T, digits = 1,
          title = "Diff-in-diff estimates combining all elections.", 
          add.lines = list(
            c("Precincts FE", rep("\\checkmark", 4)),
            c("Election FE", "\\checkmark", rep("", 3)),
            c("Oblast FE $\\times$ time", "", rep("\\checkmark", 3)),
            c("Covariates $\\times$ time", c("", "", "\\checkmark", "\\checkmark")),
            c("Precincts FE $\\times$ time", c("", "", "", "\\checkmark"))),
          covariate.labels = c("Effect on turnout"),
          dep.var.labels.include = FALSE,
          no.space = TRUE,
          keep = "X",
          star.cutoffs = c(.05, 0.01, 0.001))

stargazer(z[[1]][[2]], z[[2]][[2]], z[[3]][[2]], z[[4]][[2]],
          style="apsr",
          label = "tab:main_turnout_factor",
          omit.table.layout = "n", omit.stat=c("rsq", "f", "ser"), omit=c("Constant"),
          float = T, digits = 1,
          title = "Diff-in-diff estimates combining all elections.", 
          add.lines = list(
            c("Precincts FE", rep("\\checkmark", 4)),
            c("Election FE", "\\checkmark", rep("", 3)),
            c("Oblast-election FE", "", rep("\\checkmark", 3)),
            c("Covariates", c("", "", "\\checkmark", "\\checkmark")),
            c("Time trend", c("", "", "", "\\checkmark"))),
          covariate.labels = c("Effect on pro-Soviet turnout"),
          dep.var.labels.include = FALSE,
          no.space = TRUE,
          keep = "X",
          star.cutoffs = c(.05, 0.01, 0.001)
)

## Recode "time" variable back to numeric, for later consistency
d[, time := as.numeric(cycle)]

## Two period specifications:

# Create two-period data
D_data <- function(elections){
  D <- copy(d)
  D <- D[election %in% ymd(elections)]
  D <- D[name %in% D[n > 0 & election %in% ymd(elections), name]]
  D <- D[order(name, cycle)]
  D[, n_start := max(n), by = name]
  D[, removed := 1*(n[election == ymd(elections[1])] > n[election == ymd(elections[2])]), by = name]
  D[, removed_any := 1*(n[1] > n[2]), by = name]
  D[, removed_all := 1*(n[2] == 0), by = name]
  D[, n_baseline := max(n, na.rm = TRUE), by = name]
  D[, removed_n := n_baseline - n, by = name]
  D[, removed_1 := 1*(n_baseline - n == 1), by = name]
  D[, removed_2 := 1*(n_baseline - n == 2), by = name]
  D[, post := 1*(election > ymd(elections[1])), by = name]
  D[, X := 1*(removed == 1 & post == 1)]
  D
}

x <- list(c("2012-10-28", "2014-05-25"),
          c("2014-05-25", "2014-10-26"),
          c("2014-10-26", "2019-03-31"),
          c("2014-10-26", "2019-07-21"))

out_each <- lapply(1:3, function(z) lapply(x, function(x) outcomes(D_data(x), m = models[[z]])))

# Full table (except the first election) for appendix and for paper for model 3
tab_out <- function(j){
  tab <- lapply(1:2, function(i) lapply(out_each[[j]], function(z) coef_out(z[[i]], name = "X", dollar = TRUE, below = FALSE))%>%unlist())%>%do.call("cbind", .)
  tab <- tab[ which(row.names(tab) %in% "b"),]
  # Which election is compared?
  e <- lapply(x, function(z) paste(as.yearmon(z), collapse = " - ")) %>% unlist()
  # Precincts per analysis
  n <- lapply(out_each[[j]], function(x) length(unique(x[[1]]$fe$name)))%>%unlist()%>%prettyNum(big.mark = ",")
  data.frame(e, tab, n)
}

#######################################
# Table 2: Two-period DiD Regressions
#######################################
tab_out(3) %>% kable()

#######################################
# Table B.3. Appendix B.2: 
#######################################
lapply(1:2, tab_out) %>% rbindlist() %>% kable()

## Save the first pair as a separate dataset:
D <- D_data(c("2012-10-28", "2014-05-25"))

############################################
# Table 3: Decomposition of the DiD effects
############################################
dd_table <- function(felm_out){
  D[, Y := felm_out$fitted.values]
  c1 <- D[removed_any == 0 & post == 0, mean(Y)]
  c2 <- D[removed_any == 0 & post == 1, mean(Y)]
  t1 <- D[removed_any == 1 & post == 0, mean(Y)]
  t2 <- ((t1 + coefficients(felm_out)["X"] + (c2 - c1)))
  tab <- data.table(c1, c2, c.delta = c2-c1, t1, t2, t.delta = t2-t1, delta = (t2-t1) - (c2-c1))
  tab[, lapply(.SD, roundr, n = 1)][]
}

lapply(out_each[[3]][[1]], dd_table) %>% 
  rbindlist() %>% kable()

######################################
# Table B.5. Appendix B.4:
######################################
outcomes_w <- function(D, m){
  list(
    felm(update(m, turnout ~ .), data = D),
    felm(update(m, ru_turnout ~ .), data = D))
}

out_each <- lapply(1:3, function(z) lapply(x, function(x) outcomes_w(D_data(x), m = models[[z]])))

lapply(1:3, tab_out) %>% rbindlist() %>% kable()

#############################################################
#Table B.4. Appendix B.3 (Alternative definitions of treatment)
#############################################################

z <- list(
  outcomes(D, m = update(models[[3]], . ~ . - X + removed_all:post)),
  outcomes(D, m = update(models[[3]], . ~ . - X + removed_n:post)),
  outcomes(D, m = update(models[[3]], . ~ . - X + post:(removed_1 + removed_2))),
  outcomes(D, m = update(models[[3]], . ~ . - X + I(removed_n/n_start):post))
)

tab_out <- function(){
  vars <- c("Pro-Soviet", "Turnout")
  tab <- lapply(seq_along(z), function(j){
    lapply(1:2, function(i) data.table(coef_out(z[[j]][[i]], name = "removed", dollar = TRUE, below = TRUE), outcome = vars[[i]])) %>% rbindlist() %>% dcast(var + type ~ outcome, value.var = "b")})
  rbindlist(tab)
}

tab <- tab_out()
tab[grepl("all", var), var := "All monuments removed"]
tab[grepl("removed_n:post", var), var := "Number of removed monuments"]
tab[grepl("removed_1", var), var := "One monument removed"]
tab[grepl("removed_2", var), var := "Two monuments removed"]
tab[grepl("removed_n/n_start", var), var := "Percentage of monuments removed"]
tab[type == "SE", var := ""][, type := NULL]

kable(tab)

#############################################################
# Table B.6. Appendix B.6 (Floor effects)
#############################################################
q <- D[post==1, quantile(ru_turnout, c(.25, .5))]
out_robust <- list(
  outcomes(D = D, m = models[[3]]),
  outcomes(D = D[ name %in% D[post==1 & ru_turnout >= q[1], name]], m = models[[3]]),
  outcomes(D = D[ name %in% D[post==1 & ru_turnout >= q[2], name]], m = models[[3]])
)

tab <- lapply(1, function(i) lapply(out_robust, function(z) coef_out(z[[i]], name = "X", below = FALSE, dollar = TRUE))) %>% first() %>% rbindlist() %>% subset(select = "b")
n <- lapply(out_robust, function(x) x[[1]]$N/2)%>%unlist()%>%prettyNum(big.mark = ",")
e <- c("All precincts", paste("Pro-Soviet turnout above", roundr(q, 1), "\\% (", c("25th", "50th"), "percentile)"))

data.table(e, tab, n) %>% kable()

################################################################
# Figure 2. Appendix B.5: Sensitivity to exclusion of oblasts
#################################################################
D <- D_data(c("2012-10-28", "2014-05-25"))
oblast_plot <- lapply(unique(D$oblast), function(x) outcomes(D[!oblast %in% x], models[[3]]) %>% lapply(., function(y) data.table(coefficients(y)["X"],  confint(y, "X"))%>%setnames(c("b", "l", "u")))%>%rbindlist()%>%cbind(c("Pro-Soviet turnout", "Overall turnout"), x))%>%rbindlist()
oblast_plot[, x := stri_trans_general(x, "ukrainian-latin/BGN") %>% gsub("oblast", "", .) %>% str_trim()  %>% gsub("·|ʹ", "", .)]

ggplot(data = oblast_plot, aes(color = V2, x = factor(x)), color = c("black", "grey")) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  geom_point(aes(y = b), size = 3, position = position_dodge(width = 1/2)) + 
  geom_errorbar(aes(ymin = l, ymax = u), width=.2, size = 2, position = position_dodge(width = 1/2)) + 
  theme_bw() + ylab("Diff-in-diff coefficient") + xlab("") + 
  scale_color_manual(values=c("dark grey", "black")) + 
  coord_flip() + theme(
    legend.position="top", 
    legend.title = element_blank(), 
    legend.text=element_text(size = 14),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    axis.text=element_text(size = 14),
    axis.text.x=element_text(size = 12, face = "bold", color = "black"),
    axis.title=element_text(size = 12))

#############################################################
# Figure 3. Appendix C:
#############################################################

p_load(gridExtra)
D[, ru_share := 100*(pro_russian/votes)]
grid.arrange(
  ggplot(data = D[, list(turnout = turnout[time == 7] - turnout[time == 6], ru_turnout = ru_turnout[time == 7] - ru_turnout[time == 6]), by = name], aes(x = turnout, y = ru_turnout)) + 
    geom_point(alpha = 0.5) + geom_smooth(col = "dark red", method = "lm") + 
    ylab("") + xlab("Change in overall turnout") + 
    ggtitle("Change in pro-Soviet turnout")  + theme_minimal(),
  ggplot(data = D[, list(turnout = turnout[time == 7] - turnout[time == 6], ru_share = ru_share[time == 7] - ru_share[time == 6]), by = name], aes(x = turnout, y = ru_share)) + 
    geom_point(alpha = 0.5) + geom_smooth(col = "dark red", method = "lm") + 
    xlab("Change in overall turnout") + ylab("") + 
    ggtitle("Change in pro-Soviet vote-percentage") + theme_minimal(), ncol  = 2)

#############################################################
# Table C.8. Appendix C.2: Effects on other parties
#############################################################
nat_out <- list(
  felm(update(models[[3]], I(100*nationalist/voters) ~ .), data = D, weights = D$voters),
  felm(update(models[[3]], I(100*(votes - pro_russian - nationalist)/voters) ~ .), data = D, weights = D$voters),
  felm(update(models[[3]], I(100*(votes - pro_russian)/voters) ~ .), data = D, weights = D$voters)
)

tab <- cbind(
  c("Effect on nationalist turnout", 
    "Effect on turnout for parties other than pro-Soviet", 
    "Effect on turnout for centrist parties"),
  lapply(nat_out, coef_out, name = "X", below = FALSE, dollar = TRUE) %>% rbindlist())

kable(tab[,-2])

####################################################
# Figure 3: Generalized synthetic control analysis
####################################################

# Which units have been treated between 2012 and 2014?
d[, treated := (name %in% d[, (n[election ==  ymd("2014-05-25")] < n[election == ymd("2012-10-28")]), by = "name"][V1 == TRUE, name])]
# Standard diff-in-diff treatment indicator:
d[ , D := 1*(treated == TRUE & election > ymd("2012-10-28"))]
# Indicator for presidential election
d[, presidential := election %in% ymd("2004-12-25", "2010-01-17", "2010-02-07", "2014-05-25", "2019-03-31")]
# Rescale turnout to remove the saw-chain pattern
d[, turnout_m := mean(turnout), by = presidential][, turnout_m := turnout - turnout_m + mean(turnout)]

# Fit gsynth
m <- list("turnout_m", "ru_turnout")
g_out <- lapply(m, function(m) quiet(gsynth(Y = m, D = "D", weight = "voters", index = c("name","cycle"), data = d, force = "two-way", se = FALSE, estimator = "mc", seed = 1, tol = 1e-6)))

## Bootstrap cluster-corrected p-values
boot_p <- function(out, M){
  g <- function(out){
    # Data
    D <- d[!name %in% out$tr.remove.id, ]
    # Sample regions
    r <- D[, list(oblast = sample(unique(oblast), replace = TRUE))]
    # Track bootstraps
    r[, oblast_boot := .I]
    # Sample units within each sampled region
    f <- function(r) D[oblast == r, sample(unique(name), replace = TRUE)]
    # Replications per unit
    r <- r[, list(name = f(oblast)), by = .(oblast, oblast_boot)][, .(name)][, fake_name := paste(name, 1:.N, sep = "-"), by = name]
    # Boostrapped data
    D <- merge(r, D, by = "name", allow.cartesian = TRUE)
    # gsynth estimation on bootstrapped data
    gsynth(Y = out$Y, D = "D", weight = "voters", index = c("fake_name","cycle"), data = D, force = "two-way", estimator = "mc", seed = 1, tol = 1e-6, lambda = out$lambda.cv, CV = FALSE)$att
  }
  t(replicate(M, g(out)))
}

invisible(quiet(b <- lapply(g_out, boot_p, M = 100)))
     
# Boostrapped p-values based on T(G-1) distribution with finite 
# sample correction following Cameron / Gehlbach / Miller (2008)
Z  <- d[!name %in% g_out[[1]]$tr.remove.id, ]
G  <- length(unique(Z$oblast))
# Paremeters = # of precincts + # time effects + # coefficients
K  <- length(unique(Z$name)) + 2*length(unique(Z$cycle))
N  <- nrow(Z)
# Adjust S.E. for the degrees of freedom (Cameron/Gehlbach/Miller)
se <- lapply(b, function(x) apply(x, 2, sd)*sqrt(G/(G-1)*(N-1)/(N-K)))
p <- lapply(1:2, function(i) 2*pt(abs(g_out[[i]]$att/se[[i]]), df = G - 1, lower.tail = FALSE))
# Bonferroni adjustment
p <- lapply(p, p.adjust, method = "bonferroni")

g_plot <- function(i) data.table(outcome = m[[i]], 
                                 x = 1:length(unique(d$election)), 
                                 g_out[[i]]$Y.bar, 
                                 ATT = g_out[[i]]$att,
                                 p = p[[i]]
)
d_out <- lapply(1:2, g_plot) %>% rbindlist() %>% melt(measure.vars = grep("Y\\.", colnames(.), value = TRUE))

d_out[, att_lab := ""]
d_out[, att_lab := cutoff(ATT, p)]
d_out[, outcome := ifelse(grepl("ru", outcome), "Pro-Soviet turnout", "Overall turnout") %>% as.factor()]
d_out[variable == "Y.tr.bar", group := "Treated"]
d_out[variable == "Y.ct.bar", group := "Synthetic control"]
d_out[variable == "Y.co.bar", group := "Observed control"]
d_out[, group := relevel(factor(group), ref = "Treated")]

ggplot(d_out, aes(x = x, y = value, linetype = group, color = group, size = group)) + 
  geom_line(size = 1.5) + geom_point(size = 2.5) + 
  scale_x_continuous(breaks = 1:length(unique(d$election)), labels = unique(d$election) %>% ymd() %>% sort() %>% as.yearmon(), expand = expansion(c(0.05, 0.05))) + 
  facet_wrap(. ~ relevel(outcome, ref = "Overall turnout"), scales = "free_y") + theme_bw() + theme(
  aspect.ratio = 1/2,
  axis.text.x = element_text(size=10, angle=45, vjust = 1, hjust = 1, color = "black"),
  panel.grid.minor = element_blank(),
  panel.grid.major = element_blank(),
  legend.position="bottom",
  legend.text = element_text(size = 12, colour = "black"),
  legend.title = element_blank(),
  strip.text.x = element_text(size = 12, face = "bold"),
  legend.key.width = unit(5, "lines")
) + xlab("") + ylab("") + 
  geom_vline(xintercept = which(unique(d$election) %>% sort() == "2012-10-28"), col = "dark red", alpha = 0.1, size = 2) + 
  geom_text(aes(label = att_lab, y = -Inf), vjust = -1, size = 3.2) + scale_y_continuous(expand = expansion(c(0.15, 0.05))) + 
  scale_linetype_manual(values = c(1, 2, 2)) + 
  scale_color_manual(values = c("black", "black", "grey")) + 
  scale_size_manual(values = c(1.5, 1, 1)) + 
  guides(linetype = guide_legend(override.aes = list(size = 2.25)), 
         color = guide_legend())

########################################
# Figure 4: A falsification test
########################################
# Clean up (load the dataset anew)
d <- readRDS("main_data.RDS")

D_f <- function(d, elections){
  # Select precincts that have any statues at the start of the "cycle":
  D <- d[name %in% d[n > 0 & election %in% ymd(elections), name], ]
  # Indicator for treated unit
  D[, treated := (n[election == elections[1]] > n[election == elections[2]]), by = name]
  D[, cycle := as.numeric(factor(election))]
  D[, (c("oblast", "election", "name", "size")) := lapply(.SD, as.factor), .SDcols = c("oblast", "election", "name", "size")]
  D[]
} 

e <- d[, unique(election) %>% ymd() %>% sort()]

out_b <- function(model){
  D <- D_f(d, elections = c("2012-10-28", "2014-05-25"))
  x <- rep(1:length(e), each = 2)
  x <- matrix(x[-c(1, length(x))], nrow = 2)
  lapply(1:ncol(x), function(z){
    dd <- D[cycle %in% x[,z]][, post := 1*(cycle == max(cycle))][, X := post*treated][, t:=cycle]
    outcomes(D = dd, m = model) %>% lapply(function(b) data.table(outcome = b$lhs, election = e[x[2, z]], b = coefficients(b)["X"], l = confint(b, "X", level = 1 - 0.05/10)[1], u = confint(b, "X", level = 1 - 0.05/9)[2])) %>% rbindlist()
  }) %>% rbindlist()
}

b <- out_b(model = models[[3]])

ggplot(data = b, aes(color = factor(outcome, levels = c("turnout", "ru_turnout")), x = factor(election))) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  geom_vline(xintercept = which(e[-1]=="2014-05-25"), lwd = 25, color = "dark red", alpha = 0.2) + 
  geom_point(aes(y = b), size = 3, position = position_dodge(width = 1/3)) + 
  geom_errorbar(aes(ymin = l, ymax = u), width = .2, size = 2, position = position_dodge(width = 1/3)) + 
  theme_bw() + xlab("") + ylab("") + 
  scale_x_discrete(labels = as.yearmon(e[-1])) + 
  scale_color_manual(labels = c("Overall turnout", "Pro-Soviet turnout"), values=c("dark grey", "black")) + 
  theme(
    legend.position="top", 
    legend.title = element_blank(), 
    legend.text=element_text(size = 14),
    panel.grid.major.x = element_blank(), 
    panel.grid.minor.x = element_blank(),
    axis.text=element_text(size = 14),
    axis.text.x=element_text(size = 12, color = "black", angle = 45, hjust = 1),
    axis.title=element_text(size = 12))

#######################################################
#Figure 5: Estimates for different types of removals
#######################################################

# Create two-period data
D_het <- function(elections){
  setDT(lenin)
  D <- D_data(elections)
  # Check the type of removal by identifying which statue has disappeared:
  which_ids <- function(x1, x2) setdiff(strsplit(x1, ";") %>% unlist(), strsplit(x2, ";") %>% unlist())
  D[removed_any == 1, removed_by_activists := any(grepl("activists", lenin[id %in% which_ids(statue_id[1], statue_id[2]), removedby])), by = "name"]
  D[removed_any == 0, removed_by_activists := FALSE]
  # Count only removals within X months prior to a given election:
  D[removed_any == 1, removed_days_before := (max(ymd(election))  - ymd(max(lenin[id %in% which_ids(statue_id[1], statue_id[2]), date_ymd])))%>%as.numeric(), by = name]
  D[is.na(removed_days_before), removed_days_before := 0]
  D
}

out_each <- list(
  lapply(x, function(x) outcomes(D_het(x)[, X := 1*(removed_any == 1 & post == 1 & removed_days_before <= 120)], m = models[[3]])),
  lapply(x, function(x) outcomes(D_het(x)[, X := 1*(removed_any == 1 & post == 1 & removed_by_activists == TRUE)], m = models[[3]])),
  lapply(x, function(x) outcomes(D_het(x)[, X := 1*(removed_any == 1 & post == 1 & removed_by_activists == FALSE)], m = models[[3]]))
)

# Which elections are being compared?
els <- lapply(x, function(z) paste(as.yearmon(z), collapse = " - ")) %>% unlist()
# How treatment is defined?
removed_by <- c("Removed within four \n months before election", 
                "Removed by protesters", 
                "Removed by government")

cofs <- lapply(1:3, function(i) data.table(lapply(out_each[[i]], function(z) lapply(z, function(z) data.table(b = coefficients(z)["X"], confint(z, "X"))) %>% rbindlist()  %>% setnames(c("b", "l", "u")) %>% data.table (y = c("Overall turnout", "Pro-Soviet turnout"), .)) %>% rbindlist() %>% data.table(e = rep(els, each = 2), .), removed_by = removed_by[[i]])) %>% rbindlist()

ggplot(data = cofs, aes(color = y, x = factor(e, rev(els)))) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  geom_point(aes(y = b), size = 3, position = position_dodge(width = 1/3)) + 
  geom_errorbar(aes(ymin = l, ymax = u), width=.2, size = 2, position = position_dodge(width = 1/3)) + 
  theme_bw() + ylab("") + xlab("") +
  scale_color_manual(values=c("dark grey", "black")) + 
  facet_grid(. ~ factor(removed_by, levels = unique(cofs$removed_by))) + 
  coord_flip() + 
  theme(
  legend.position="bottom", 
  legend.title = element_blank(), 
  legend.text=element_text(size = 14),
  panel.grid.major = element_blank(), 
  panel.grid.minor = element_blank(),
  axis.text=element_text(size = 12),
  axis.text.x=element_text(size = 12),
  axis.title=element_text(size = 12),
  strip.text = element_text(size=12, face = "bold")
)


###################################################################
# The Role of Protests
####################################################################

# Load UIK polygons from 2019 and use as the main template
uik <- readRDS("uik_polygons.RDS")

###################################################################
# Figure 4. Appendix D.1:
###################################################################

# Protest data from media (expand to by-day format)
protest <- readRDS("protests.RDS")
ggplot() + geom_sf(data = ukr, fill = "white", lwd = 0.1, alpha = 0.1) + 
  coord_sf(crs = st_crs(ukr), datum = NA) + 
  geom_sf(data = protest, alpha = .75, pch = 16, color = "dark red") + 
  coord_sf(crs = st_crs(protest), datum = NA) + 
  theme_minimal()
# Merge protest data with other geographies
protest <- st_join(uik[, c("name", "ADM2_PCODE", "ADM3_PCODE")], protest) %>% st_set_geometry(NULL) %>% data.table()

# Load twitter data
twitter <- readRDS("twitter.RDS")
# Map on the uik template (merge by unique coordinate and then map back, to save computation time)
D <- twitter[, .(lon, lat)] %>% unique() %>% st_as_sf(coords = c("lon", "lat"), crs = st_crs(uik), remove = FALSE)
D <- st_join(D, uik[, c("name", "ADM2_PCODE", "ADM3_PCODE")]) %>% st_set_geometry(NULL)
# Map back the merged attributes on the main twitter frame
twitter <- twitter[D, on = c("lon", "lat")]

########################################
# Figure 5. Appendix D.2: 
#########################################
D <- twitter[, .N, by = .(lon, lat, includes_maidan)]
D <- st_as_sf(D, coords = c("lon", "lat"), crs = st_crs(ukr))
cols <- c(brewer.pal(9, "Blues")[6], brewer.pal(9, "Oranges")[6])
ggplot() + geom_sf(data = ukr, fill = "white", lwd = 0.1, alpha = 0.1) + 
  coord_sf(crs = st_crs(ukr), datum = NA) + 
  geom_sf(data = D, aes(color = factor(includes_maidan)), alpha = 1, cex = 0.2, pch = 16) + 
  coord_sf(crs = st_crs(D), datum = NA) + 
  theme_minimal() + 
  scale_color_manual(name = "", values = cols, labels = c("All tweets", "Tweets on Euromaidan")) + 
  theme(legend.position="bottom", legend.key.size = unit(3,"line")) + 
  guides(color = guide_legend(override.aes = list(size = 4)))

# This function that merges datasets and produces DiD estimates at the level given by "unit" value; "variables" are measures of protest

z_out <- function(unit = "ADM3_PCODE", variables = NULL){
  
  # Aggregating protests over the revelant units
  p <- protest[ , c(unit, "start_date", "end_date"), with = FALSE]
  p <- p[!is.na(start_date), list(date = seq(mdy(start_date)[1], mdy(end_date)[1], by = 1)), by = unit][, protest := 1] %>% setnames(unit, "unit")
  
  # Prepare election data for cycles 5:6
  d <- readRDS("main_data.RDS")
  d <- d[election %in% ymd(c("2012-10-28", "2014-05-25"))]
  # Aggregate election/monument data to unit level
  d[, unit := get(unit)]
  d <- d[, list(
    voters = sum(voters),
    votes = sum(votes),
    turnout = 100*sum(votes)/sum(voters),
    ru_turnout = 100*sum(pro_russian)/sum(voters),
    statues = sum(n),
    dist_to_kiev = mean(dist_to_kiev),
    size_small = mean(size == "mala"),
    size_large = mean(size == "velyka"),
    lon = mean(lon),
    lat = mean(lat),
    km_road = mean(km_road),
    post = 1*(election == "2014-05-25")
  ), by = .(ADM1_UA, unit, election)]
  d[, statue_exists := 1*(unit %in% d[statues > 0 & election == "2012-10-28", unit])]
  d[, statue_removed := 1*(statues[1] > statues[2]), by = unit]
  
  ############################################
  # Merge & analyze (predict protests by day)
  ############################################
  
  # Twitter data by unit-date
  D <- twitter[!is.na(get(unit)), list(
    tweets = .N, 
    tweets_followers = sum(followers_count)), by = c(unit, "includes_maidan", "date")]
  D[ , maidan := ifelse(includes_maidan == 0, "all", "maidan")]
  D <- dcast(D, get(unit) + date ~ maidan, value.var = c("tweets", "tweets_followers"), fill = 0)
  
  # Create unit-day template and merge twitter and protest data on the top of it
  data <- CJ(date = seq(ymd("2013-12-01"), ymd("2014-02-24"), by = 1), unit = D[, unit] %>% unique())
  data <- merge(data, D, by = c("unit", "date"), all.x = TRUE)
  data[p, on = c("unit", "date"), protest := i.protest]
  cols <- c("tweets_all", "tweets_maidan", "tweets_followers_all", "tweets_followers_maidan", "protest") 
  data[, (cols) := lapply(.SD, function(x) replace_na(x, replace = 0)), .SDcols = cols]
  
  cols <- grep("tweets", colnames(data), value = TRUE)
  
  # Design matrix for predictions
  cat("Fitting random forest", '\n')
  rf_out <- randomForest(factor(protest) ~ tweets_all + tweets_maidan + tweets_followers_all + 
                         tweets_followers_maidan, data = data,
                 replace = TRUE, 
                 strata = data$protest, 
                 sampsize = c(sum(data$protest), sum(data$protest)),
                 ntree = 1000)
  data[, protest_predict := predict(rf_out, type = "prob")[,2]]
  cat("Area under ROC = ", roc.area(data$protest, data$protest_predict)$A, "\n")
  cat("Accuracy = ", (data[protest==1, sum(protest_predict >= 1/2)] + data[protest==0, sum(protest_predict < 1/2)])/nrow(data), "\n")
  
  p_data <- data[, list(
    protests_predicted = log(1 + sum(protest_predict > 1/2)),
    tweets_all = log(1 + sum(tweets_all)),
    tweets_maidan = log(1 + sum(tweets_maidan)),
    tweets_followers_all = log(1 + sum(tweets_followers_all)),
    tweets_followers_maidan = log(1 + sum(tweets_followers_maidan))
  ), by = unit] %>% melt(id.vars = c("unit", "protests_predicted"))
  
  # Figure 6. Appendix D.2:
  print(ggplot(p_data, aes(x = value, y = protests_predicted)) + 
          geom_point(alpha = 0.1, col = "dark blue") + 
          geom_smooth(color = "black") + 
          facet_wrap(. ~ variable, scales = "free") + 
          theme_bw() + ylab("Predicted protests (log scale)") + 
          xlab("Feature value (log scale)"))
  
  # Now map predictions to the the complete dataset of unit-by-day:
  D <- merge(CJ(date = unique(data$date), unit = unique(d[, unit])), data, by = c("unit", "date"), all.x = TRUE)
  cols <- grep("tweets|protest", colnames(D), value = TRUE)
  D[, (cols) := lapply(.SD, function(x) replace_na(x, replace = 0)), .SDcols = cols]
  X <- D[,grep("tweets", colnames(data), value = TRUE), with = FALSE]
  D[, protest_predict := predict(rf_out, newdata = D, type = "prob")[,2]]
  
  # Aggregate twitter and protest data to unit level and merge to election/monument data
  data[, protest_cut := protest_predict > 0.5]
  data <- merge(d, D[, lapply(.SD, sum), 
                     by = unit, 
                     .SDcols = grep("protest|tweets", names(D), value = TRUE)], 
                by = "unit", all.x = TRUE)
  cat("Units with none observed = ", data[election == "2012-10-28"][protest == 0, .N], "\n")
  cat("Predict protests where none observed = ", data[election == "2012-10-28"][protest == 0, sum(protest_predict > 0)], "\n")
  
  # Use only councils with two time periods (removes one observation):
  data <- data[unit %in% data[, .N, by = unit][N==2, unit]] 
  
  # Regressions
  m_out <- list()
  
  for (i in variables){
    data[, Z := get(i)]
    m <- Formula(y ~ post:(statue_removed  + log(1 + Z) + log(dist_to_kiev) + 
                             lon*lat + size_small + size_large + 
                             log(km_road)) | unit + factor(election)*factor(ADM1_UA) | 0 | unit + ADM1_UA)
    m_out[[i]] <-  list(
      outcomes(D = data[statue_exists == 1], m = m),
      outcomes(D = data[statue_exists == 0], m = update(m, . ~ . - post:statue_removed)))
  }
  
  attr(m_out, "unit") <- unit
  attr(m_out, "variables") <- variables
  
  m_out
  
}

################################################
# Protest/twitter
################################################

tab_out <- function(z, Unit = ""){
  unit <- attr(z, "unit")
  variables <- attr(z, "variables")
  tab <- lapply(1:3, function(k){
    lapply(1:2, function(i){lapply(z[[k]][[i]], function(x){
      b <- coef_out(x, name = "removed|Z|tweets", dollar = FALSE, below = TRUE)
      data.table(outcome = attributes(x$response)[[2]][[2]], b, N = prettyNum(x$N, big.mark=",",scientific=FALSE), subset = ifelse(i==1, "With monuments", "Without monuments"))}) %>% 
        rbindlist()}) %>% rbindlist() %>% data.table(., Z = variables[k])}) %>% rbindlist()
  tab[, outcome := ifelse(grepl("ru_turnout", outcome), "Pro-Soviet turnout", "Overall turnout") %>% factor(levels = c("Overall turnout", "Pro-Soviet turnout"))]  
  tab[, var := ifelse(grepl("Z", var), "Protests", "Removals")]
  tab[, Z:=factor(Z, levels = variables)]
  tab <- dcast(tab, Z + var + type ~ subset + outcome, value.var = c("b"))
  tab[type == "SE", var := ""]
  n <- lapply(1:2, function(x) z[[1]][[x]][[1]]$N) %>% unlist() %>% prettyNum(big.mark=",",scientific=FALSE)
  tab[,-c(1,3)]
}

vars <- c("protest", "tweets_followers_maidan", "protest_predict")

########################################################
# Table 4: Protest as an alternative mechanism
########################################################
z <- z_out(unit = "ADM3_PCODE", variables = vars) %>% tab_out(Unit = "Councils")
kable(z[,1:3])
kable(z[,c(1, 4:5)])

########################################################
# Table D.9. Appendix D.3:
########################################################
z <- z_out(unit = "ADM2_PCODE", variables = vars) %>% tab_out(Unit = "Rayons")
kable(z[,1:3])
kable(z[,c(1, 4:5)])

########################################################
# Structural topic analysis
########################################################
l <- readRDS("stm_text.RDS")
processed <- textProcessor(l$text, metadata = data.frame(l))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta) 

st_out <- stm(documents = out$documents, 
              vocab = out$vocab, 
              K = 3, 
              prevalence = ~ factor(period), 
              max.em.its = 75, 
              data = out$meta, 
              init.type = "Spectral", 
              seed = 1, 
              verbose = FALSE)

# Simulate 
simBetas <- function (parameters, nsims = 1000){
  simbetas <- list()
  for (i in 1:length(parameters)){
    simbetas[[i]] <- do.call(rbind, lapply(parameters[[i]], function(x) rmnorm(n = nsims, mean = x$est, varcov = x$vcov)))
  }
  apply(simbetas[[1]], 2, quantile, c(.025, .5, .975)) %>% t()
}

# Topic words
words <- apply(labelTopics(st_out)$prob, 1, paste, collapse = ", ")
en_words <- c("Russia, country, war, Soviet, Russian, power, state", 
              "conversation, first, people, new, film, book", 
              "monument, city, Donetsk, places, local, Kyiv, people")
words <- paste(c("{\\bf Sovereignty}", "{\\bf Culture}", "{\\bf Protest}"), en_words, sep = ": ")

st_figure <- lapply(1:3, function(i){
  st_reg <- estimateEffect(1:3 ~ -1 + factor(period), st_out,  meta = out$meta, uncertainty = "Global")
  data.table(period = 1:3, topic = words[i], simBetas(parameters = st_reg$parameters[i]))
}) %>% rbindlist() %>% setnames(c("period", "topic", "l", "b", "u"))
st_figure[, topic := factor(topic, levels = words[3:1])]

#################################
#Figure 6: The meaning of Lenin
#################################

ggplot(st_figure, aes(x = period, y = b, color = topic, shape = topic)) + 
  geom_point(position = position_dodge(width = 1/10), size = 4) + 
  geom_line(position = position_dodge(width = 1/10)) +
  geom_errorbar(aes(ymin = l, ymax = u), width=.01, size = 2, 
                position = position_dodge(width = 1/10)) + 
  scale_x_discrete(limits = factor(c(1:3)), 
                   labels = c("Dec 2013-May 2014", "May 2014-Oct 2014", "Oct 2014-March 2019")) + 
  xlab("") + ylab("Topic probability") + 
  theme_bw() + 
  scale_color_manual(values=c("#000000", "#E69F00", "#56B4E9")) + 
  theme(
  legend.position="bottom", 
  legend.title = element_text(size = 12), 
  legend.text=element_text(size = 12), 
  panel.grid.minor = element_blank(),
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_line(linetype = "dotted", color = "black", size = .25),
  axis.text=element_text(size = 12),
  axis.text.x=element_text(size = 12, color = "black", angle = 0, hjust = .5),
  axis.title=element_text(size = 12)) 

# This is to extract the point estimates reported in the text (not tables or figures)
est_b <- estimateEffect(1:3 ~ factor(period), st_out,  meta = out$meta, uncertainty = "Global") %>% summary()
est_b$tables

# How long did it take?
round(Sys.time() - start, 1)
